R Sys.Date()samples_1 <- paste0("Eickelberg_",
c("B6B6_1",
"B6B6_2",
"B6B6_3",
"HLAB6_2",
"HLAB6_3"))
samples_2 <- c(
"1_B_65_11152018",
"1_B6_B6_11132018",
"2_HLA_5_11152018")
samples_3 <- c(
"NaiveB6",
"B6_B6_171501",
"HLA_B6_163107",
"HLA_B6_18467"
)
samples <- c(samples_1, samples_2, samples_3)
sample_paths <- file.path(data_dir,
samples,
"outs",
"filtered_feature_bc_matrix")
names(sample_paths) <- c(str_remove(samples_1, "Eickelberg_"),
"B6B6_5",
"B6B6_4",
"HLAB6_5",
"Naive_B6",
"B6B6_6",
"HLAB6_6",
"HLAB6_7")metrics_paths <- file.path(data_dir,
samples,
"outs",
"metrics_summary.csv")
names(metrics_paths) <- str_remove(samples, "Eickelberg_")
mapping_dat <- map_dfr(metrics_paths, read_csv, .id = "sample")## Parsed with column specification:
## cols(
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_double(),
## `Number of Reads` = col_number(),
## `Valid Barcodes` = col_character(),
## `Sequencing Saturation` = col_character(),
## `Q30 Bases in Barcode` = col_character(),
## `Q30 Bases in RNA Read` = col_character(),
## `Q30 Bases in UMI` = col_character(),
## `Reads Mapped to Genome` = col_character(),
## `Reads Mapped Confidently to Genome` = col_character(),
## `Reads Mapped Confidently to Intergenic Regions` = col_character(),
## `Reads Mapped Confidently to Intronic Regions` = col_character(),
## `Reads Mapped Confidently to Exonic Regions` = col_character(),
## `Reads Mapped Confidently to Transcriptome` = col_character(),
## `Reads Mapped Antisense to Gene` = col_character(),
## `Fraction Reads in Cells` = col_character(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_double(),
## `Number of Reads` = col_number(),
## `Valid Barcodes` = col_character(),
## `Sequencing Saturation` = col_character(),
## `Q30 Bases in Barcode` = col_character(),
## `Q30 Bases in RNA Read` = col_character(),
## `Q30 Bases in UMI` = col_character(),
## `Reads Mapped to Genome` = col_character(),
## `Reads Mapped Confidently to Genome` = col_character(),
## `Reads Mapped Confidently to Intergenic Regions` = col_character(),
## `Reads Mapped Confidently to Intronic Regions` = col_character(),
## `Reads Mapped Confidently to Exonic Regions` = col_character(),
## `Reads Mapped Confidently to Transcriptome` = col_character(),
## `Reads Mapped Antisense to Gene` = col_character(),
## `Fraction Reads in Cells` = col_character(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_double(),
## `Number of Reads` = col_number(),
## `Valid Barcodes` = col_character(),
## `Sequencing Saturation` = col_character(),
## `Q30 Bases in Barcode` = col_character(),
## `Q30 Bases in RNA Read` = col_character(),
## `Q30 Bases in UMI` = col_character(),
## `Reads Mapped to Genome` = col_character(),
## `Reads Mapped Confidently to Genome` = col_character(),
## `Reads Mapped Confidently to Intergenic Regions` = col_character(),
## `Reads Mapped Confidently to Intronic Regions` = col_character(),
## `Reads Mapped Confidently to Exonic Regions` = col_character(),
## `Reads Mapped Confidently to Transcriptome` = col_character(),
## `Reads Mapped Antisense to Gene` = col_character(),
## `Fraction Reads in Cells` = col_character(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_double(),
## `Number of Reads` = col_number(),
## `Valid Barcodes` = col_character(),
## `Sequencing Saturation` = col_character(),
## `Q30 Bases in Barcode` = col_character(),
## `Q30 Bases in RNA Read` = col_character(),
## `Q30 Bases in UMI` = col_character(),
## `Reads Mapped to Genome` = col_character(),
## `Reads Mapped Confidently to Genome` = col_character(),
## `Reads Mapped Confidently to Intergenic Regions` = col_character(),
## `Reads Mapped Confidently to Intronic Regions` = col_character(),
## `Reads Mapped Confidently to Exonic Regions` = col_character(),
## `Reads Mapped Confidently to Transcriptome` = col_character(),
## `Reads Mapped Antisense to Gene` = col_character(),
## `Fraction Reads in Cells` = col_character(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_double(),
## `Number of Reads` = col_number(),
## `Valid Barcodes` = col_character(),
## `Sequencing Saturation` = col_character(),
## `Q30 Bases in Barcode` = col_character(),
## `Q30 Bases in RNA Read` = col_character(),
## `Q30 Bases in UMI` = col_character(),
## `Reads Mapped to Genome` = col_character(),
## `Reads Mapped Confidently to Genome` = col_character(),
## `Reads Mapped Confidently to Intergenic Regions` = col_character(),
## `Reads Mapped Confidently to Intronic Regions` = col_character(),
## `Reads Mapped Confidently to Exonic Regions` = col_character(),
## `Reads Mapped Confidently to Transcriptome` = col_character(),
## `Reads Mapped Antisense to Gene` = col_character(),
## `Fraction Reads in Cells` = col_character(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_double(),
## `Number of Reads` = col_number(),
## `Valid Barcodes` = col_character(),
## `Sequencing Saturation` = col_character(),
## `Q30 Bases in Barcode` = col_character(),
## `Q30 Bases in RNA Read` = col_character(),
## `Q30 Bases in UMI` = col_character(),
## `Reads Mapped to Genome` = col_character(),
## `Reads Mapped Confidently to Genome` = col_character(),
## `Reads Mapped Confidently to Intergenic Regions` = col_character(),
## `Reads Mapped Confidently to Intronic Regions` = col_character(),
## `Reads Mapped Confidently to Exonic Regions` = col_character(),
## `Reads Mapped Confidently to Transcriptome` = col_character(),
## `Reads Mapped Antisense to Gene` = col_character(),
## `Fraction Reads in Cells` = col_character(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_double(),
## `Number of Reads` = col_number(),
## `Valid Barcodes` = col_character(),
## `Sequencing Saturation` = col_character(),
## `Q30 Bases in Barcode` = col_character(),
## `Q30 Bases in RNA Read` = col_character(),
## `Q30 Bases in UMI` = col_character(),
## `Reads Mapped to Genome` = col_character(),
## `Reads Mapped Confidently to Genome` = col_character(),
## `Reads Mapped Confidently to Intergenic Regions` = col_character(),
## `Reads Mapped Confidently to Intronic Regions` = col_character(),
## `Reads Mapped Confidently to Exonic Regions` = col_character(),
## `Reads Mapped Confidently to Transcriptome` = col_character(),
## `Reads Mapped Antisense to Gene` = col_character(),
## `Fraction Reads in Cells` = col_character(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_double(),
## `Number of Reads` = col_number(),
## `Valid Barcodes` = col_character(),
## `Sequencing Saturation` = col_character(),
## `Q30 Bases in Barcode` = col_character(),
## `Q30 Bases in RNA Read` = col_character(),
## `Q30 Bases in UMI` = col_character(),
## `Reads Mapped to Genome` = col_character(),
## `Reads Mapped Confidently to Genome` = col_character(),
## `Reads Mapped Confidently to Intergenic Regions` = col_character(),
## `Reads Mapped Confidently to Intronic Regions` = col_character(),
## `Reads Mapped Confidently to Exonic Regions` = col_character(),
## `Reads Mapped Confidently to Transcriptome` = col_character(),
## `Reads Mapped Antisense to Gene` = col_character(),
## `Fraction Reads in Cells` = col_character(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_number(),
## `Number of Reads` = col_number(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_number(),
## `Number of Reads` = col_number(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_number(),
## `Number of Reads` = col_number(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Estimated Number of Cells` = col_number(),
## `Mean Reads per Cell` = col_number(),
## `Median Genes per Cell` = col_number(),
## `Number of Reads` = col_number(),
## `Total Genes Detected` = col_number(),
## `Median UMI Counts per Cell` = col_number()
## )
## See spec(...) for full column specifications.
clean_up_metadata <- function(metrics_summary) {
metrics_summary <- mutate_all(metrics_summary, str_remove, "%$")
metrics_summary <- mutate_at(metrics_summary, .vars= 2:ncol(metrics_summary), as.numeric)
metrics_summary
}
mapping_dat <- clean_up_metadata(mapping_dat)
metrics <- colnames(mapping_dat)[2:ncol(mapping_dat)]
mapping_dat <- mapping_dat %>%
gather(metric, value, -sample) %>%
mutate(sample = factor(sample, levels = unique(sample)))
p <- map(metrics,
function(x) {
filter(mapping_dat, metric == x) %>%
ggplot(aes(sample, value)) +
geom_point(aes(color = sample)) +
scale_color_brewer(palette = "Paired") +
labs(y = x,
x = "") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
}) [[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
[[1]]
so <- CreateSeuratObject(
mat,
min.cells = 3,
min.features = 200,
names.delim = "_",
names.field = c(1,2)
)
so <- PercentageFeatureSet(so,
pattern = "^mt-",
col.name = "percent.mt")
so@meta.data <- so@meta.data %>%
tibble::rownames_to_column("cell") %>%
mutate(condition = str_split(orig.ident, "_", simplify = TRUE) %>% .[, 1] ) %>%
as.data.frame() %>%
tibble::column_to_rownames("cell")
dir.create(object_dir, showWarnings = FALSE)
saveRDS(so, file.path(object_dir, "unfiltered.rds"))plot_violin(so@meta.data,
"orig.ident",
"percent.mt",
.fill = "condition") +
labs(x = "", y = "Percent UMIs from Mitochondria")plot_violin(so@meta.data,
"orig.ident",
"nFeature_RNA",
.fill = "condition") +
labs(x = "", y = "# of genes per cell")sample_names <- as.character(unique(so@meta.data$orig.ident))
per_sample <- map(sample_names, ~filter(so@meta.data,
orig.ident == .x))
p <- list()
for(i in seq_along(per_sample)){
.col <- discrete_palette_default[i]
p[[i]] <- ggplot(per_sample[[i]], aes(nCount_RNA, percent.mt)) +
geom_point(aes(color = condition)) +
scale_color_manual(values = .col)
}
plot_grid(plotlist= p, nrow = 4, ncol = 3)No filtering applied at this point.
Annotate samples and assign colors to each sample.
annots <- so@meta.data %>%
select(orig.ident, condition) %>%
unique() %>%
arrange(condition)
subgroup_order <- c(
"B6B6",
"HLAB6",
"Naive"
)
annots <- mutate(annots, condition = factor(condition, levels = subgroup_order))
sample_order <- annots$orig.ident
so@meta.data$orig.ident <- factor(so@meta.data$orig.ident, levels = sample_order)
so@meta.data$condition <- factor(so@meta.data$condition, levels = subgroup_order)so <- NormalizeData(so)
so <- FindVariableFeatures(
so,
selection.method = "vst",
nfeatures = 2000,
verbose = FALSE
)
so <- ScaleData(so, verbose = TRUE)
so <- RunPCA(so,
npcs = 100,
verbose = FALSE)
ElbowPlot(so, ndims = 100)
# make graphs and use graphs for UMAP
so <- FindNeighbors(so,
reduction = "pca",
dims = 1:30,
k.param = 20L)
so <- RunUMAP(so,
graph = "RNA_snn",
min.dist = 0.3)
res <- c(0.05, 0.075, 0.1, 0.3, 0.5)
so <- FindClusters(so,
resolution = res)
map(str_c("RNA_snn_res.", res),
~plot_umap(so, .x)) %>%
plot_grid(plotlist = .,
nrow = 3,
ncol = 2)so$coarse_clusters <- so$RNA_snn_res.0.1
Idents(so) <- "coarse_clusters"
plot_umap(so, "coarse_clusters")## [[1]]
##
## [[2]]
##
## [[3]]
so <- readRDS(file.path(object_dir, "so.rds"))
Idents(so) <- "coarse_clusters"
mkrs <- FindAllMarkers(so, only.pos = TRUE)
write_tsv(mkrs,
file.path(mkrs_dir, "coarse_cluster_markers_all_data.tsv"))so <- readRDS(file.path(object_dir, "so.rds"))
mkrs <- read_tsv(file.path(mkrs_dir, "coarse_cluster_markers_all_data.tsv"))## Parsed with column specification:
## cols(
## p_val = col_double(),
## avg_logFC = col_double(),
## pct.1 = col_double(),
## pct.2 = col_double(),
## p_val_adj = col_double(),
## cluster = col_double(),
## gene = col_character()
## )
p <- DotPlot(so,
features = unique(topx$gene),
group.by = "coarse_clusters") +
coord_flip() +
labs(x = "Cluster",
y = "")
pThe Tabula Muris is a collection of single cell dataset from 20 organs.
We will use the gene expression profiles from these datasets to try to assign cell types to the identified clusters. The clustifyr package will correlate experession in our clusters to the tabula muris and report the most correlated cell type. It’s unlikely that these will be 100% correct, because I don’t believe that the tabula muris will have all of the cell types in a mouse, and I don’t believe hat they have cells from developing tissues. Nevertheless it will give us a good starting point to annotating the cell types.
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.0.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
## ========================================
so <- readRDS(file.path(object_dir, "so.rds"))
tm_average <- readRDS("/Users/kriemo/Projects/sc_repos/vanotterloo/data/tabula_muris/TM_avg_expr.rds")
mdata <- get_metadata(so)
res <- clustify(so@assays$RNA@data,
tm_average,
query_genes = so@assays$RNA@var.features,
metadata = mdata,
cluster_col = "coarse_clusters",
compute_method = "spearman")
hmap <- Heatmap(t(res),
viridis::viridis(256),
"Spearman\ncorrelation",
row_title = "Cell types from Tabula Muris",
column_title = "Clusters")
pdf(file.path(fig_dir, "cell_type_heatmap.pdf"), height = 12, width = 12)
draw(hmap)
dev.off()## quartz_off_screen
## 2
Reuse existing annotations
so2 <- readRDS(file.path("objects_2", "so.rds"))
so2_mdata <- get_metadata(so2)
rm(so2)
gc()
so_mdata <- get_metadata(so) %>%
select(-(PC_1:UMAP_2))
so2_mdata_simple <- so2_mdata %>%
select(cell, cell_types)
combined_mdata <- left_join(so_mdata, so2_mdata_simple, by = "cell")
best_cell_type <- combined_mdata %>%
group_by(cell_types, coarse_clusters) %>%
summarize(n = n()) %>%
group_by(cell_types) %>%
summarize(coarse_clusters = nth(coarse_clusters, which(n == max(n))))
so@meta.data <- so_mdata %>%
left_join(best_cell_type, by = "coarse_clusters") %>%
column_to_rownames("cell")library(SoupX)
dataDirs <- file.path(data_dir, samples, "outs")
scl <- load10X(dataDirs)
scl <- inferNonExpressedGenes(scl)
map(str_c("Channel", 1:12),
~rownames(scl$channels[[.x]]$nonExpressedGenes)[seq(50)]
) %>%
unlist() %>%
unique() %>%
.[!str_detect(., "^Rp[ls]")] %>%
.[!str_detect(., "^mt")]
soup_genes <- list(
HB = c("Hbb-bs", "Hba-a1", "Hba-a2", "Hbb-bt", "H2-Ab1", "H2-Aa", "H2-Ab1", "H2-Eb1"),
Club = c("Scgb1a1"),
Bcell = c("Cd74", "Ighg2c", "Igkc", "Ighm", "Ighg3"),
AEC2 = c("Sftpc"),
MAC = c("Apoe", "Lyz2", "Psap" ),
OTHER = c("Ly6c1", "Ly6a" )
)
for(channel in str_c("Channel", 1:12)){
message(channel)
scl <- calculateContaminationFraction(scl, channel, soup_genes)
}
for(channel in str_c("Channel", 1:12)){
message(channel)
gg = plotChannelContamination(scl, channel)
plot(gg)
}
for(channel in str_c("Channel", 1:12)){
message(channel)
scl = interpolateCellContamination(scl, channel, useGlobal = FALSE)
}
scl <- strainCells(scl)
scl <- adjustCounts(scl)
rename_cols <- names(sample_paths)
names(rename_cols) <- str_c("Channel", 1:12)
count_matrix <- scl$atoc
channels <- colnames(count_matrix) %>%
str_match("Channel[0-9]+") %>%
.[, 1]
new_ids <- rename_cols[channels]
cell_ids <- str_remove(colnames(count_matrix),
"Channel[0-9]+___")
new_ids <- str_c(new_ids, "_", cell_ids) %>%
str_remove("-[0-9]+$")
colnames(count_matrix) <- new_idscount_matrix <- count_matrix[rownames(so), colnames(so)]
cell_types <- so$cell_types
so <- CreateSeuratObject(counts = count_matrix ,
names.field = c(1, 2))
so$cell_types <- cell_types
so$condition <- str_split(so$orig.ident, "_", simplify = TRUE) %>% .[, 1]
so <- NormalizeData(so)
so <- FindVariableFeatures(
so,
selection.method = "vst",
nfeatures = 2000,
verbose = FALSE
)
so <- ScaleData(so, verbose = TRUE)
so <- RunPCA(so,
npcs = 100,
verbose = FALSE)
ElbowPlot(so, ndims = 100)
# make graphs and use graphs for UMAP
so <- FindNeighbors(so,
reduction = "pca",
dims = 1:30,
k.param = 20L)
so <- RunUMAP(so,
graph = "RNA_snn",
min.dist = 0.3)
res <- c(0.05, 0.075, 0.1, 0.3, 0.5)
so <- FindClusters(so,
resolution = res)
map(str_c("RNA_snn_res.", res),
~plot_umap(so, .x)) %>%
plot_grid(plotlist = .,
nrow = 3,
ncol = 2)so$coarse_clusters <- so$RNA_snn_res.0.1
Idents(so) <- "coarse_clusters"
plot_umap(so, "coarse_clusters")
plot_umap(so, "cell_types")
plot_umap(so, "condition")to_plot <- c(
"coarse_clusters",
"cell_types",
"condition",
"orig.ident",
"Mzb1",
"Cd19"
)
plt <- map(to_plot,
~plot_umap(so, .x)) %>%
plot_grid(plotlist = ., nrow = 3, ncol = 2,
rel_widths = c(1.2, 2, 1.2, 1.2, 1.2, 1.2))
save_plot(file.path(fig_dir,
paste0("umap_summary_soupx.pdf")),
plt,
nrow = 3,
ncol = 2,
base_asp = 1.2)vdj_subdirs <- c(
"NaiveB6_Bcell",
"B6_B6_171501_Bcell",
"HLA_B6_163107_Bcell",
"HLA_B6_168467_Bcell")
vdj_dirs <- file.path("/Users/kriemo/Projects/sc_repos/eickelberg/data/vdj",
vdj_subdirs,
"outs")
prefixes <- str_c(names(sample_paths)[9:12], "_")
out <- map2_dfr(vdj_dirs, prefixes, get_clonotypes)
so_tmp <- add_clonotypes(so,
vdj_dirs,
prefixes = prefixes)
so_tmp$frequency <- ifelse(is.na(so_tmp$frequency),
0L,
so_tmp$frequency)
so_tmp$proportion <- ifelse(is.na(so_tmp$proportion),
0L,
so_tmp$proportion)
so_tmp$clonotype_present <- ifelse(is.na(so_tmp$cdr3s_aa),
"None",
"Present")
c_mat <- so_tmp@meta.data %>%
tibble::rownames_to_column("cell") %>%
mutate(clonotype_id = str_c(str_sub(orig.ident, -1),
"-",
raw_clonotype_id)) %>%
select(cell, clonotype_id) %>%
mutate(present = 1L,
clonotype_id = ifelse(is.na(clonotype_id),
"no_clonotype",
clonotype_id),
clonotype_id = ifelse(clonotype_id == "None",
"no_clonotype",
clonotype_id)) %>% # need to figure out where None came from...
spread(cell, present, fill = 0L) %>%
tibble::column_to_rownames("clonotype_id") %>%
as.data.frame() %>%
as.matrix()
c_mat <- c_mat[, colnames(so_tmp@assays$RNA@data)]
c_mat <- as(c_mat, "dgCMatrix")
so_tmp@assays[["tcr"]] <- CreateAssayObject(counts = c_mat)